knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
options(repos = c(CRAN = "https://cran.stat.auckland.ac.nz/"))
required_pkgs <- c(
"dplyr", "tidyr", "readr", "tibble", "janitor", "skimr",
"recipes", "rsample", "tidymodels", "rpart", "kernlab",
"FactoMineR", "factoextra", "caret" , "purrr", "plotly",
"RColorBrewer","glue"
)
to_install <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(required_pkgs, library, character.only = TRUE))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ purrr 1.0.4
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ ggplot2 3.5.2 ✔ workflows 1.2.0
## ✔ infer 1.0.8 ✔ workflowsets 1.1.0
## ✔ modeldata 1.4.0 ✔ yardstick 1.3.2
## ✔ parsnip 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
## The following object is masked from 'package:dials':
##
## buffer
## The following object is masked from 'package:scales':
##
## alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Data Load and Inspection
candidate <- c("../data/unzipped", "./data/unzipped", "data/unzipped")
data_dir <- candidate[file.exists(candidate)][1]
if (is.na(data_dir)) stop("Cannot find data folder.")
path_annotated <- file.path(data_dir,"seeds_annotated.csv")
path_unlabeled <- file.path(data_dir,"seeds.csv")
annotated <- read_csv(path_annotated, locale=locale(decimal_mark=","), show_col_types=FALSE)
unlabeled <- read_csv(path_unlabeled, locale=locale(decimal_mark=","), show_col_types=FALSE)
has_annotated_index <- "...1" %in% names(annotated)
has_unlabeled_index <- "...1" %in% names(unlabeled)
if (has_annotated_index) annotated <- select(annotated,-...1)
if (has_unlabeled_index) unlabeled <- select(unlabeled,-...1)
glimpse(unlabeled)
## Rows: 13,511
## Columns: 16
## $ Area <dbl> 28395, 28734, 29380, 30008, 30140, 30279, 30477, 30519…
## $ Perimeter <dbl> 610.291, 638.018, 624.110, 645.884, 620.134, 634.927, …
## $ MajorAxisLength <dbl> 208.1781, 200.5248, 212.8261, 210.5580, 201.8479, 212.…
## $ MinorAxisLength <dbl> 173.8887, 182.7344, 175.9311, 182.5165, 190.2793, 181.…
## $ AspectRation <dbl> 1.197191, 1.097356, 1.209713, 1.153638, 1.060798, 1.17…
## $ Eccentricity <dbl> 0.5498122, 0.4117853, 0.5627273, 0.4986160, 0.3336797,…
## $ ConvexArea <dbl> 28715, 29172, 29690, 30724, 30417, 30600, 30970, 30847…
## $ EquivDiameter <dbl> 190.1411, 191.2728, 193.4109, 195.4671, 195.8965, 196.…
## $ Extent <dbl> 0.7639225, 0.7839681, 0.7781132, 0.7826813, 0.7730980,…
## $ Solidity <dbl> 0.9888560, 0.9849856, 0.9895588, 0.9766957, 0.9908932,…
## $ roundness <dbl> 0.9580271, 0.8870336, 0.9478495, 0.9039364, 0.9848771,…
## $ Compactness <dbl> 0.9133578, 0.9538608, 0.9087742, 0.9283288, 0.9705155,…
## $ ShapeFactor1 <dbl> 0.007331506, 0.006978659, 0.007243912, 0.007016729, 0.…
## $ ShapeFactor2 <dbl> 0.003147289, 0.003563624, 0.003047733, 0.003214562, 0.…
## $ ShapeFactor3 <dbl> 0.8342224, 0.9098505, 0.8258706, 0.8617944, 0.9419004,…
## $ ShapeFactor4 <dbl> 0.9987239, 0.9984303, 0.9990661, 0.9941988, 0.9991661,…
glimpse(annotated)
## Rows: 100
## Columns: 17
## $ Area <dbl> 32708, 31590, 59344, 174621, 63096, 34550, 43880, 3307…
## $ Perimeter <dbl> 657.630, 649.120, 944.279, 1591.312, 1010.607, 742.826…
## $ MajorAxisLength <dbl> 233.1297, 235.0561, 355.4248, 590.5549, 402.1450, 269.…
## $ MinorAxisLength <dbl> 178.9160, 171.3433, 214.7311, 379.9938, 202.8656, 164.…
## $ AspectRation <dbl> 1.303012, 1.371843, 1.655208, 1.554117, 1.982322, 1.64…
## $ Eccentricity <dbl> 0.6411054, 0.6845706, 0.7968680, 0.7654864, 0.8634357,…
## $ ConvexArea <dbl> 33047, 31908, 60734, 177034, 64321, 35311, 44179, 3356…
## $ EquivDiameter <dbl> 204.0714, 200.5533, 274.8802, 471.5234, 283.4366, 209.…
## $ Extent <dbl> 0.7560795, 0.7523399, 0.7148932, 0.7663656, 0.7080528,…
## $ Solidity <dbl> 0.9897419, 0.9900338, 0.9771133, 0.9863698, 0.9809549,…
## $ roundness <dbl> 0.9503873, 0.9421271, 0.8363461, 0.8665541, 0.7763313,…
## $ Compactness <dbl> 0.8753555, 0.8532147, 0.7733850, 0.7984414, 0.7048119,…
## $ ShapeFactor1 <dbl> 0.007127605, 0.007440839, 0.005989229, 0.003381924, 0.…
## $ ShapeFactor2 <dbl> 0.002581435, 0.002432400, 0.001321702, 0.000847844, 0.…
## $ ShapeFactor3 <dbl> 0.7662472, 0.7279753, 0.5981244, 0.6375087, 0.4967599,…
## $ ShapeFactor4 <dbl> 0.9984291, 0.9986676, 0.9900206, 0.9907632, 0.9847381,…
## $ Class <chr> "D", "D", "C", "B", "E", "D", "F", "D", "F", "G", "D",…
names(annotated) <- trimws(names(annotated))
annotated <- mutate(annotated, Class=as.factor(Class))
annotated |> glimpse()
## Rows: 100
## Columns: 17
## $ Area <dbl> 32708, 31590, 59344, 174621, 63096, 34550, 43880, 3307…
## $ Perimeter <dbl> 657.630, 649.120, 944.279, 1591.312, 1010.607, 742.826…
## $ MajorAxisLength <dbl> 233.1297, 235.0561, 355.4248, 590.5549, 402.1450, 269.…
## $ MinorAxisLength <dbl> 178.9160, 171.3433, 214.7311, 379.9938, 202.8656, 164.…
## $ AspectRation <dbl> 1.303012, 1.371843, 1.655208, 1.554117, 1.982322, 1.64…
## $ Eccentricity <dbl> 0.6411054, 0.6845706, 0.7968680, 0.7654864, 0.8634357,…
## $ ConvexArea <dbl> 33047, 31908, 60734, 177034, 64321, 35311, 44179, 3356…
## $ EquivDiameter <dbl> 204.0714, 200.5533, 274.8802, 471.5234, 283.4366, 209.…
## $ Extent <dbl> 0.7560795, 0.7523399, 0.7148932, 0.7663656, 0.7080528,…
## $ Solidity <dbl> 0.9897419, 0.9900338, 0.9771133, 0.9863698, 0.9809549,…
## $ roundness <dbl> 0.9503873, 0.9421271, 0.8363461, 0.8665541, 0.7763313,…
## $ Compactness <dbl> 0.8753555, 0.8532147, 0.7733850, 0.7984414, 0.7048119,…
## $ ShapeFactor1 <dbl> 0.007127605, 0.007440839, 0.005989229, 0.003381924, 0.…
## $ ShapeFactor2 <dbl> 0.002581435, 0.002432400, 0.001321702, 0.000847844, 0.…
## $ ShapeFactor3 <dbl> 0.7662472, 0.7279753, 0.5981244, 0.6375087, 0.4967599,…
## $ ShapeFactor4 <dbl> 0.9984291, 0.9986676, 0.9900206, 0.9907632, 0.9847381,…
## $ Class <fct> D, D, C, B, E, D, F, D, F, G, D, F, G, E, C, G, F, G, …
head(annotated)
## # A tibble: 6 × 17
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 32708 658. 233. 179. 1.30 0.641
## 2 31590 649. 235. 171. 1.37 0.685
## 3 59344 944. 355. 215. 1.66 0.797
## 4 174621 1591. 591. 380. 1.55 0.765
## 5 63096 1011. 402. 203. 1.98 0.863
## 6 34550 743. 270. 164. 1.64 0.793
## # ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## # Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## # ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <fct>
cat("NAs in annotated:", sum(is.na(annotated)), "\n")
## NAs in annotated: 0
names(unlabeled) <- trimws(names(unlabeled))
unlabeled |> glimpse()
## Rows: 13,511
## Columns: 16
## $ Area <dbl> 28395, 28734, 29380, 30008, 30140, 30279, 30477, 30519…
## $ Perimeter <dbl> 610.291, 638.018, 624.110, 645.884, 620.134, 634.927, …
## $ MajorAxisLength <dbl> 208.1781, 200.5248, 212.8261, 210.5580, 201.8479, 212.…
## $ MinorAxisLength <dbl> 173.8887, 182.7344, 175.9311, 182.5165, 190.2793, 181.…
## $ AspectRation <dbl> 1.197191, 1.097356, 1.209713, 1.153638, 1.060798, 1.17…
## $ Eccentricity <dbl> 0.5498122, 0.4117853, 0.5627273, 0.4986160, 0.3336797,…
## $ ConvexArea <dbl> 28715, 29172, 29690, 30724, 30417, 30600, 30970, 30847…
## $ EquivDiameter <dbl> 190.1411, 191.2728, 193.4109, 195.4671, 195.8965, 196.…
## $ Extent <dbl> 0.7639225, 0.7839681, 0.7781132, 0.7826813, 0.7730980,…
## $ Solidity <dbl> 0.9888560, 0.9849856, 0.9895588, 0.9766957, 0.9908932,…
## $ roundness <dbl> 0.9580271, 0.8870336, 0.9478495, 0.9039364, 0.9848771,…
## $ Compactness <dbl> 0.9133578, 0.9538608, 0.9087742, 0.9283288, 0.9705155,…
## $ ShapeFactor1 <dbl> 0.007331506, 0.006978659, 0.007243912, 0.007016729, 0.…
## $ ShapeFactor2 <dbl> 0.003147289, 0.003563624, 0.003047733, 0.003214562, 0.…
## $ ShapeFactor3 <dbl> 0.8342224, 0.9098505, 0.8258706, 0.8617944, 0.9419004,…
## $ ShapeFactor4 <dbl> 0.9987239, 0.9984303, 0.9990661, 0.9941988, 0.9991661,…
head(unlabeled)
## # A tibble: 6 × 16
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28395 610. 208. 174. 1.20 0.550
## 2 28734 638. 201. 183. 1.10 0.412
## 3 29380 624. 213. 176. 1.21 0.563
## 4 30008 646. 211. 183. 1.15 0.499
## 5 30140 620. 202. 190. 1.06 0.334
## 6 30279 635. 213. 182. 1.17 0.520
## # ℹ 10 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## # Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## # ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>
cat("NAs in unlabeled:", sum(is.na(unlabeled)), "\n")
## NAs in unlabeled: 0
3. Z-score Normalisation
unlabeled_scaled <- select(unlabeled, where(is.numeric)) |>
scale() |> as_tibble()
4. K‑means from Scratch
delta_to_centroid <- function(X, centroid) {
n <- nrow(X)
p <- ncol(X)
rowSums((X - matrix(centroid, n, p, byrow = TRUE))^2)
}
delta_matrix <- function(X, centroids) {
k <- nrow(centroids)
sapply(seq_len(k),function(i) delta_to_centroid(X, centroids[i, ]))
}
f_kmeans <- function(data, k, max_iter = 100, start = 10) {
set.seed(82171165)
X <- as.matrix(data)
n <- nrow(X)
p <- ncol(X)
centroids <- X[sample(n), , drop = FALSE][1:k, ] # k random centroids
clusters <- integer(n)
iter_reached <- NA
for (iter in seq_len(max_iter)) {
distances <- delta_matrix(X, centroids)
clusters_new <- max.col(-distances) #idx of first largest
has_converged <- !anyNA(clusters_new) && all(clusters_new == clusters)
# cat(iter, "of", max_iter, "\n"); flush.console()
if (has_converged) {
iter_reached <- iter
cat("converge @ iter_reached:", iter_reached, "\n")
break
}
clusters <- clusters_new
# recompute centroids
for (j in seq_len(k)) {
idx <- which(clusters == j)
if (length(idx) > 0) {centroids[j, ] <- colMeans(X[idx, , drop = FALSE])
}
}
}
list(clusters = clusters, # return values
centroids = centroids,
iter = iter_reached
)
}
compute_wss <- function(k, X, max_iter, start) {
res <- tryCatch(
f_kmeans(X, k, max_iter, start),
error = function(e) NULL
)
if (is.null(res) || is.null(res$tot.withinss)) {
return(NA_real_) # must return a length-1 numeric
}
return(as.numeric(res$tot.withinss)) # force numeric(1)
}
k_vals <- 2:10
unlabeled_use <- unlabeled_scaled
if (!exists("max_iter")) max_iter <- 100
if (!exists("start")) start <- 10
wss_values <- sapply( k_vals, function(k) compute_wss(k, unlabeled_use, max_iter, start))
## converge @ iter_reached: 8
## converge @ iter_reached: 8
## converge @ iter_reached: 38
## converge @ iter_reached: 25
## converge @ iter_reached: 22
## converge @ iter_reached: 25
## converge @ iter_reached: 23
## converge @ iter_reached: 18
## converge @ iter_reached: 57
testing <- FALSE
if (testing) {
message("🟡 Quick test: 500 rows, k=2:4, iter.max=5")
unlabeled_use <- unlabeled_scaled[1:500,]; k_vals <- 2:4; max_iter <- 5
} else {
message("🟢 Full run: all rows, k=2:10, iter.max=100")
unlabeled_use <- unlabeled_scaled; k_vals <- 2:10; max_iter <- 100
}
k_vals <- 2:10
unlabeled_use <- unlabeled_scaled
wss_values <- vapply(k_vals, function(k) compute_wss(k, unlabeled_use, max_iter, start), numeric(1))
## converge @ iter_reached: 8
## converge @ iter_reached: 8
## converge @ iter_reached: 38
## converge @ iter_reached: 25
## converge @ iter_reached: 22
## converge @ iter_reached: 25
## converge @ iter_reached: 23
## converge @ iter_reached: 18
## converge @ iter_reached: 57
unlabeled_use <- unlabeled_scaled
if (exists("wss_values") && any(is.finite(wss_values))) {
plot(k_vals, wss_values, type="b", pch=19,
xlab = "k",
ylab = expression(paste("Within-cluster ", sum((x[i] - c[k])^2))),
main = if (testing) "Elbow (testing)" else "Elbow (full)",
xaxt = "n")
## xlab="Number of clusters (k)",
# ylab=expression("Within-cluster " * sum((x[i]-c[k])^2)),
# main=if(testing) "Elbow (testing)" else "Elbow (full)", xaxt="n")
axis(1, at = k_vals)
# side=1,line=3,cex=0.8)
}
debug_tbl <- purrr::map_dfr(k_vals, function(k) {
res <- f_kmeans(unlabeled_scaled,k,max_iter)
tot_within <- compute_wss(k,unlabeled_use,max_iter)
grand_mean <- colMeans(unlabeled_use)
totss <- sum(rowSums((as.matrix(unlabeled_use) -
matrix(grand_mean,nrow(unlabeled_use),ncol(unlabeled_use),byrow=TRUE))^2))
betweenss <- totss - tot_within
tibble(k, tot_within, totss, betweenss,
identity_ok=abs(totss-(betweenss+tot_within))<1e-8)
})
## converge @ iter_reached: 8
## converge @ iter_reached: 8
## converge @ iter_reached: 8
## converge @ iter_reached: 8
## converge @ iter_reached: 38
## converge @ iter_reached: 38
## converge @ iter_reached: 25
## converge @ iter_reached: 25
## converge @ iter_reached: 22
## converge @ iter_reached: 22
## converge @ iter_reached: 25
## converge @ iter_reached: 25
## converge @ iter_reached: 23
## converge @ iter_reached: 23
## converge @ iter_reached: 18
## converge @ iter_reached: 18
## converge @ iter_reached: 57
## converge @ iter_reached: 57
print(debug_tbl)
## # A tibble: 9 × 5
## k tot_within totss betweenss identity_ok
## <int> <dbl> <dbl> <dbl> <lgl>
## 1 2 NA 216160 NA NA
## 2 3 NA 216160 NA NA
## 3 4 NA 216160 NA NA
## 4 5 NA 216160 NA NA
## 5 6 NA 216160 NA NA
## 6 7 NA 216160 NA NA
## 7 8 NA 216160 NA NA
## 8 9 NA 216160 NA NA
## 9 10 NA 216160 NA NA
unlabeled_use <- unlabeled_scaled
k_vals <- 2:10
max_iter <- 100
start <- 10
wss_values <- vapply(
k_vals,
function(k) compute_wss(k, unlabeled_use, max_iter, start),
numeric(1)
)
## converge @ iter_reached: 8
## converge @ iter_reached: 8
## converge @ iter_reached: 38
## converge @ iter_reached: 25
## converge @ iter_reached: 22
## converge @ iter_reached: 25
## converge @ iter_reached: 23
## converge @ iter_reached: 18
## converge @ iter_reached: 57
if (all(is.na(wss_values)) || all(!is.finite(wss_values))) {
warning("All clustering attempts failed. No elbow plot can be drawn.")
} else {
if (all(is.na(wss_values)) || all(!is.finite(wss_values))) {
warning("All clustering attempts failed. No elbow plot can be drawn.")
} else {
if (exists("wss_values") && any(is.finite(wss_values))) {
plot(
k_vals, wss_values, type="b", pch=19,
xlab = "k",
ylab = expression(paste("Within-cluster ", sum((x[i] - c[k])^2))),
main = if (testing) "Elbow (testing)" else "Elbow (full)",
xaxt = "n"
)
axis(1, at = k_vals)
}
}
}
5. Final Clustering
# Final clustering with user-selected k
k_best <- 7
res_final <- f_kmeans(
unlabeled_scaled,
k_best,
max_iter,
start
)
## converge @ iter_reached: 25
# Attach cluster assignment to unlabeled_use
unlabeled_use$cluster <- res_final$clusters
# Optional: also compute a label mapping using majority vote (saved separately)
if (exists("annotated")) {
label_col <- "Class"
annotated_subset <- annotated |> filter(!is.na(.data[[label_col]]))
# Assign clusters to annotated points (assumes same row indexing)
annotated_subset$cluster <- res_final$clusters[as.integer(rownames(annotated_subset))]
# Determine the most common class in each cluster
mapping <- annotated_subset |>
count(.data[[label_col]], cluster, name = "n") |>
group_by(cluster) |>
slice_max(n, n = 1, with_ties = FALSE) |>
ungroup()
cluster_map_vote <- setNames(as.character(mapping[[label_col]]), mapping$cluster)
# Assign vote-based labels separately
unlabeled_use$label_vote <- cluster_map_vote[as.character(unlabeled_use$cluster)]
}
# Print diagnostic output
res_final$iter
## [1] 25
table(unlabeled_use$cluster)
##
## 1 2 3 4 5 6 7
## 2011 515 3137 1763 2461 3089 535
# Use centroids of clusters and annotated class groups to align labels by proximity
# Get centroids of clusters (in scaled unlabeled data)
cluster_centroids <- unlabeled_scaled %>%
as.data.frame() %>%
mutate(cluster = res_final$clusters) %>%
group_by(cluster) %>%
summarise(across(everything(), mean), .groups = "drop")
# Get centroids of known classes (in scaled annotated data)
annotated_scaled <- annotated %>%
mutate(Class = as.factor(Class)) %>%
select(-Class) %>%
scale() %>%
as.data.frame()
annotated_centroids <- annotated_scaled %>%
mutate(Class = annotated$Class) %>%
group_by(Class) %>%
summarise(across(everything(), mean), .groups = "drop")
# Compute distances between each cluster centroid and class centroid
cluster_matrix <- as.matrix(select(cluster_centroids, -cluster))
class_matrix <- as.matrix(select(annotated_centroids, -Class))
distance_matrix <- as.matrix(dist(rbind(cluster_matrix, class_matrix)))
# Extract top-left block: distances between clusters and classes
k <- nrow(cluster_matrix)
m <- nrow(class_matrix)
prox_matrix <- distance_matrix[1:k, (k + 1):(k + m)]
# Assign each cluster to closest class
closest_class_index <- apply(prox_matrix, 1, which.min)
closest_class_labels <- annotated_centroids$Class[closest_class_index]
# Map cluster numbers to class labels
cluster_to_label <- setNames(as.character(closest_class_labels), cluster_centroids$cluster)
unlabeled_use$label <- cluster_to_label[as.character(unlabeled_use$cluster)]
pca_result <- PCA(unlabeled_scaled, graph = FALSE)
pca_coords <- as.data.frame(pca_result$ind$coord[, 1:3])
colnames(pca_coords) <- c("PC1", "PC2", "PC3")
# Use class labels instead of cluster numbers
pca_coords$Class <- unlabeled_use$label
# Color palette
pal <- brewer.pal(max(3, length(unique(pca_coords$Class))), "Set2")
# Plot
plot_ly(data = pca_coords,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~Class, colors = pal,
type = "scatter3d", mode = "markers",
marker = list(size = 3)) |>
layout(title = "PCA – Colored by Assigned Class Labels",
legend = list(title = list(text = "Class")),
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
))
if (!exists("res.pca")||!exists("res.hcpc")) {
hcpc_data<-scale(unlabeled)
res.pca<-PCA(hcpc_data,graph=FALSE)
res.hcpc<-HCPC(res.pca,graph=FALSE)
}
pca_coords<-as.data.frame(res.pca$ind$coord[,1:3]);
colnames(pca_coords)<-c("PC1","PC2","PC3");
pca_coords$cluster <- factor(unlabeled_use$cluster)
pal<-brewer.pal(max(3,length(levels(pca_coords$cluster))),"Set2")
if (knitr::is_html_output()) {
plot_ly(data=pca_coords,x=~PC1,y=~PC2,z=~PC3,type="scatter3d",mode="markers",
color=~cluster,colors=pal,marker=list(size=4))|>
layout(scene=list(xaxis=list(title="PC1"),yaxis=list(title="PC2"),zaxis=list(title="PC3")),
legend=list(title=list(text="Cluster")))
} else {
fviz_cluster(res.hcpc,geom="point",repel=TRUE,show.clust.cent=TRUE,axes=c(1,2),palette=pal)
}
pca_coords2 <- as.data.frame(pca_result$ind$coord[, 1:3])
colnames(pca_coords2) <- c("PC1", "PC2", "PC3")
pca_coords2$label <- unlabeled_use$label_proximity
pal2 <- brewer.pal(max(3, length(unique(pca_coords2$label))), "Set2")
plot_ly(data = pca_coords2,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~label, colors = pal2,
type = "scatter3d", mode = "markers",
marker = list(size = 3)) |>
layout(title = "PCA - Colored by Class Label (Centroid Proximity)",
legend = list(title = list(text = "Class")),
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
))
6. Classification of Annotated Data
# Ensure Class is a factor
annotated$Class <- as.factor(annotated$Class)
# Remove class temporarily, scale, then add back
ann <- select(annotated, -Class) |> scale() |> as_tibble()
ann$Class <- annotated$Class
# Create training/test split
set.seed(82171165)
idx <- createDataPartition(ann$Class, p = 0.7, list = FALSE)
train <- ann[idx, ]
test <- ann[-idx, ]
# Train classifier
model <- train(Class ~ ., data = train, method = "svmRadial")
# Predict and evaluate
pred <- predict(model, test)
confusionMatrix(pred, test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 1 0 0 0 0 0 0
## B 0 1 0 0 0 0 0
## C 0 0 4 0 1 0 0
## D 0 0 0 6 0 0 0
## E 0 0 0 0 1 0 0
## F 0 0 0 0 0 6 0
## G 1 0 0 0 0 0 6
##
## Overall Statistics
##
## Accuracy : 0.9259
## 95% CI : (0.7571, 0.9909)
## No Information Rate : 0.2222
## P-Value [Acc > NIR] : 1.014e-14
##
## Kappa : 0.9085
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 0.50000 1.00000 1.0000 1.0000 0.50000 1.0000
## Specificity 1.00000 1.00000 0.9565 1.0000 1.00000 1.0000
## Pos Pred Value 1.00000 1.00000 0.8000 1.0000 1.00000 1.0000
## Neg Pred Value 0.96154 1.00000 1.0000 1.0000 0.96154 1.0000
## Prevalence 0.07407 0.03704 0.1481 0.2222 0.07407 0.2222
## Detection Rate 0.03704 0.03704 0.1481 0.2222 0.03704 0.2222
## Detection Prevalence 0.03704 0.03704 0.1852 0.2222 0.03704 0.2222
## Balanced Accuracy 0.75000 1.00000 0.9783 1.0000 0.75000 1.0000
## Class: G
## Sensitivity 1.0000
## Specificity 0.9524
## Pos Pred Value 0.8571
## Neg Pred Value 1.0000
## Prevalence 0.2222
## Detection Rate 0.2222
## Detection Prevalence 0.2593
## Balanced Accuracy 0.9762
pca_result <- PCA(unlabeled_scaled, graph = FALSE)
pca_coords <- as.data.frame(pca_result$ind$coord[, 1:3])
colnames(pca_coords) <- c("PC1", "PC2", "PC3")
# Use Class labels derived from cluster mapping
pca_coords$Class <- unlabeled_use$label
# Color palette
pal <- brewer.pal(max(3, length(unique(pca_coords$Class))), "Set2")
# Plot
plot_ly(data = pca_coords,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~Class, colors = pal,
type = "scatter3d", mode = "markers",
marker = list(size = 3)) |>
layout(title = "PCA – Colored by Assigned Class Labels",
legend = list(title = list(text = "Class")),
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
))
7. HCPC 3D Factor Map
if (!exists("res.pca")||!exists("res.hcpc")) {
hcpc_data<-scale(unlabeled)
res.pca<-PCA(hcpc_data,graph=FALSE)
res.hcpc<-HCPC(res.pca,graph=FALSE)
}
pca_coords<-as.data.frame(res.pca$ind$coord[,1:3]);
colnames(pca_coords)<-c("PC1","PC2","PC3");
pca_coords$cluster <- factor(unlabeled_use$cluster)
pal<-brewer.pal(max(3,length(levels(pca_coords$cluster))),"Set2")
if (knitr::is_html_output()) {
plot_ly(data=pca_coords,x=~PC1,y=~PC2,z=~PC3,type="scatter3d",mode="markers",
color=~cluster,colors=pal,marker=list(size=4))|>
layout(scene=list(xaxis=list(title="PC1"),yaxis=list(title="PC2"),zaxis=list(title="PC3")),
legend=list(title=list(text="Cluster")))
} else {
fviz_cluster(res.hcpc,geom="point",repel=TRUE,show.clust.cent=TRUE,axes=c(1,2),palette=pal)
}
# Combine cluster info with PCA coordinates
pca_clustered <- pca_coords |>
mutate(cluster = unlabeled_use$cluster)
# Select one cluster to highlight
target_cluster <- 3
highlight <- filter(pca_clustered, cluster == target_cluster)
# Plot just the selected cluster in 2D
ggplot(highlight, aes(x = PC1, y = PC2)) +
geom_point(color = "steelblue", alpha = 0.7) +
labs(
title = glue("Cluster {target_cluster}: PCA Projection"),
x = "PC1", y = "PC2"
) +
theme_minimal()

# Set which cluster to highlight
target_cluster <- 3
# Compute PCA coordinates if not already done
if (!exists("pca_coords")) {
pca_coords <- as.data.frame(res.pca$ind$coord[, 1:2])
colnames(pca_coords) <- c("PC1", "PC2")
}
# Add cluster assignment
pca_coords$cluster <- factor(res_final$clusters)
# Filter only points from the target cluster
highlight <- pca_coords |> filter(cluster == target_cluster)
# Plot only this cluster
ggplot(highlight, aes(x = PC1, y = PC2)) +
geom_point(color = "steelblue", size = 3, alpha = 0.6) +
labs(
title = glue("Points from Cluster {target_cluster}"),
x = "PC1", y = "PC2"
) +
theme_minimal()
